This is the same notebook as 100_species, but we look at the rottnest queryset. This should be slightly harder. # Setup
knitr::opts_chunk$set(warning = FALSE)
source(here::here('code', 'helpers.R'))
library(tidyverse)
library(forcats)
library(cowplot)
library(ggupset)
library(RColorBrewer)
library(patchwork)
library(vegan)
library(pheatmap)
library(broom)
data <- targets::tar_read(merged_all_results)
# rename BLAST to BLAST97 to differentiate from BLAST100 (percentage identity in both cases)
data <- data |> mutate(Type = str_replace(Type, '^BLAST$', 'BLAST97'))
truth <- targets::tar_read(truth_set_data)
table(data$Type)
##
## BLAST100 BLAST97 Kraken_0.0 Kraken_0.05 Kraken_0.1 Metabuli
## 9025 13893 82560 82560 55040 32545
## MMSeqs2 Mothur NBC Qiime2 TNT VSEARCH
## 89856 87048 98280 13229 74412 13331
Let’s remove the >0.2 Kraken runs, those are too strict
data <- data |> filter(!Type %in% c('Kraken_0.2', 'Kraken_0.3', 'Kraken_0.4', 'Kraken_0.5', 'Kraken_0.6', 'Kraken_0.7', 'Kraken_0.8', 'Kraken_0.9'))
Made a mistake- Metabuli’s and TNT’s databases is misspelled
data <- data |> mutate(Subject = str_replace_all(Subject, pattern = '_ref.fasta', replacement=''))
data <- data |> mutate(Subject = str_replace_all(Subject, pattern = 'final.csv', replacement = 'final.fasta'))
table(data$Query)
##
## KWest_16S_PooledTrue.fa
## 161697
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa
## 59339
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa
## 59339
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa
## 57248
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa
## 57248
## make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa
## 50021
## make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa
## 50505
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa
## 14656
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa
## 15664
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa
## 61169
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa
## 64893
table(data$Subject)
##
## 12s_v010_final.fasta
## 9731
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 9234
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 9091
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 9187
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 9253
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 9127
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 9258
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 8978
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 9032
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 9377
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 9334
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 9516
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 9342
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 9330
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 9517
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 9385
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 9371
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 9270
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 9401
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 9300
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 9413
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 8765
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 8774
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 8988
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 8878
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 8986
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 8969
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 8712
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 8793
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 8747
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 8973
## 12S_v10_HmmCut.fasta
## 7204
## 16S_v04_final.fasta
## 9457
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 9648
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 9422
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 9882
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 9881
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 9850
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 10072
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 9597
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 10048
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 9482
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 9636
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 8897
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 8983
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 8838
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 8622
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 8831
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 9114
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 8716
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 9044
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 9003
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 9011
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 9168
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 9422
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 9369
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 9292
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 9718
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 9219
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 9045
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 9342
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 9221
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 9415
## 16S_v04_HmmCut.fasta
## 7834
## c01_v03_final.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 1720
## c01_v03_HmmCut.fasta
## 1720
table(data$Subject)
##
## 12s_v010_final.fasta
## 9731
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 9234
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 9091
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 9187
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 9253
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 9127
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 9258
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 8978
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 9032
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 9377
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 9334
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 9516
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 9342
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 9330
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 9517
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 9385
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 9371
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 9270
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 9401
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 9300
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 9413
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 8765
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 8774
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 8988
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 8878
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 8986
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 8969
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 8712
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 8793
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 8747
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 8973
## 12S_v10_HmmCut.fasta
## 7204
## 16S_v04_final.fasta
## 9457
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 9648
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 9422
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 9882
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 9881
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 9850
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 10072
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 9597
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 10048
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 9482
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 9636
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 8897
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 8983
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 8838
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 8622
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 8831
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 9114
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 8716
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 9044
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 9003
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 9011
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 9168
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 9422
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 9369
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 9292
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 9718
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 9219
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 9045
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 9342
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 9221
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 9415
## 16S_v04_HmmCut.fasta
## 7834
## c01_v03_final.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 3124
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 1720
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 1720
## c01_v03_HmmCut.fasta
## 1720
twelveS_data <- data |> filter(Subject == '12s_v010_final.fasta')
sixteenS_data <- data |> filter(Subject == '16S_v04_final.fasta')
table(twelveS_data$Query)
##
## KWest_16S_PooledTrue.fa
## 2371
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa
## 1046
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa
## 1046
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa
## 744
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa
## 744
## make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa
## 693
## make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa
## 700
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa
## 253
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa
## 211
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa
## 1079
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa
## 844
table(sixteenS_data$Query)
##
## KWest_16S_PooledTrue.fa
## 2766
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa
## 720
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa
## 720
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa
## 922
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa
## 922
## make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa
## 594
## make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa
## 600
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa
## 178
## make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa
## 236
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa
## 738
## make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa
## 1061
table(sixteenS_data$Subject)
##
## 16S_v04_final.fasta
## 9457
twelveS_data_vs_12S_100 <- twelveS_data |>
filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa')
sixteenS_data_vs_16S_100 <- sixteenS_data |>
filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa' )
twelveS_data_vs_12S_100 |> select(Type, species) |> filter(species != 'dropped' &
!is.na(species)) |>
group_by(Type) |> count(species) |> summarise(n = n()) |>
ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() +
theme_minimal() +
ylab('Count') +
ggtitle('12S: Species-level hits per classifier')
twelveS_data_vs_12S_100 |> select(Type, genus) |> filter(genus != 'dropped' &
!is.na(genus)) |>
group_by(Type) |> count(genus) |> summarise(n = n()) |>
ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() +
theme_minimal() +
ylab('Count') +
ggtitle('12S: Genus-level hits per classifier')
twelveS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(twelveS_truth)
## # A tibble: 6 × 3
## True_OTU True_family True_species
## <chr> <chr> <chr>
## 1 ASV_1 Syngnathidae Phyllopteryx taeniolatus
## 2 ASV_2 Syngnathidae Phycodurus eques
## 3 ASV_3 Syngnathidae Phyllopteryx taeniolatus
## 4 ASV_4 Labridae dropped
## 5 ASV_5 Alopiidae Isurus oxyrinchus
## 6 ASV_6 Carangidae Seriola lalandi
twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
mutate(Correct = True_species == species) |>
filter(species != 'dropped' & !is.na(species)) |>
group_by(Type) |> count(Correct) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, Correct, n)), fill = Correct, y = n))+ geom_col() +
coord_flip() + theme_minimal() + xlab('Type') +
ggtitle('12S: Correct and incorrect species-level classifications (absolute)') +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7"))
cols <- c('Correct species' = "#009E73", 'Correct genus'="#56B4E9", 'Correct family' = "#0072B2", 'Incorrect family' = "#E69F00", 'Incorrect genus'="#F0E442", 'Incorrect species'="#D55E00", 'No hit'= "#CC79A7")
twelve_s_relative_plot <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
count(CorrectSpecies) |>
mutate(total = sum(n, na.rm=TRUE)) |>
mutate(missing = 102 - total) |>
group_modify(~ add_row(.x)) |>
group_modify(~ {
.x |> mutate(new_col= max(missing, na.rm=TRUE)) |>
mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
TRUE ~ n)) |>
select(-new_col)
} ) |>
mutate(total = 102) |>
mutate(perc = n / total * 100) |>
mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |>
mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+
geom_col() +
coord_flip() +
theme_minimal() +
ylab('Percentage') + xlab('Type') +
ggtitle('12S: Correct and incorrect species-level classifications (relative)') +
scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
twelve_s_relative_plot
### Calculate Upset-based species sightings
type_list <- twelveS_data_vs_12S_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |>
group_by(species) |>
summarize('Type' = list(Type))
a <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('12S: Shared species') +
ylab('Species')
a
type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species == True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
b <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('12S: Shared correct species') +
ylab('Species')
b
type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species != True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
c <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('12S: Shared incorrect species') +
ylab('Species')
c
a + b + c & ylim(c(0, 30)) &
theme(
# Hide panel borders and remove grid lines
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
#panel.grid.major.y = element_line(),
# Change axis line
axis.line = element_line(colour = "black")
)
add_scores <- function(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth ) {
twelveS_data_vs_12S_100_with_MaxTruth|> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 102 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums))
}
scores <- add_scores(twelveS_data_vs_12S_100, twelveS_truth)
scores <- scores |> rowwise() |> mutate(recall = recall(TP, FN), precision = precision(TP, FP),
f1 = f1(precision, recall), f0.5 = f0.5(precision, recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 12 × 10
## # Rowwise:
## Type TP FP TN FN recall precision f1 f0.5 accuracy
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BLAST100 57 2 0 43 0.57 0.966 0.717 0.848 0.559
## 2 BLAST97 59 8 0 35 0.628 0.881 0.733 0.815 0.578
## 3 Kraken_0.0 60 12 0 30 0.667 0.833 0.741 0.794 0.588
## 4 Kraken_0.05 57 9 0 36 0.613 0.864 0.717 0.798 0.559
## 5 Kraken_0.1 53 5 0 44 0.546 0.914 0.684 0.805 0.520
## 6 MMSeqs2 64 7 0 31 0.674 0.901 0.771 0.844 0.627
## 7 Metabuli 40 6 0 56 0.417 0.870 0.563 0.714 0.392
## 8 Mothur 42 22 0 38 0.525 0.656 0.583 0.625 0.412
## 9 NBC 55 18 0 29 0.655 0.753 0.701 0.731 0.539
## 10 Qiime2 31 18 0 53 0.369 0.633 0.466 0.554 0.304
## 11 TNT 41 8 0 53 0.436 0.837 0.573 0.707 0.402
## 12 VSEARCH 52 17 0 33 0.612 0.754 0.675 0.720 0.510
twelveS_scoreS_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1)) + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('12S scores')
twelveS_scoreS_plot
Let’s also make a heatmap from that
b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, recall, precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
table(twelveS_data_vs_12S_100$Type)
##
## BLAST100 BLAST97 Kraken_0.0 Kraken_0.05 Kraken_0.1 Metabuli
## 79 93 102 102 102 75
## MMSeqs2 Mothur NBC Qiime2 TNT VSEARCH
## 102 102 102 49 102 69
First, we count the per-OTU species hits
twelveS_data_vs_12S_100_maxCount <- twelveS_data_vs_12S_100 |>
mutate(species = na_if(species, 'dropped')) |>
filter(!is.na(species)) |>
#filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Qiime2', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1')) |>
group_by(Query, Subject, OTU) |>
count(species) |>
# double check the truth
#left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
#mutate(Truth = True_species == species) |>
# pull out the per-group highest n
filter( n > 4) |>
slice_max(n, n=1, with_ties = FALSE) |>
mutate(Type = 'MaxCount', .before = 'Query') |>
select(-n)
twelveS_data_vs_12S_100_maxCount
## # A tibble: 73 × 5
## # Groups: Query, Subject, OTU [73]
## Type Query Subject OTU species
## <chr> <chr> <chr> <chr> <chr>
## 1 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_1 Phyllo…
## 2 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Carcha…
## 3 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Isurus…
## 4 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Acanth…
## 5 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Pseudo…
## 6 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Parupe…
## 7 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Sphyrn…
## 8 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Parupe…
## 9 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Chaeto…
## 10 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Dascyl…
## # ℹ 63 more rows
twelveS_data_vs_12S_100_with_MaxTruth <- twelveS_data_vs_12S_100 |>
bind_rows(twelveS_data_vs_12S_100_maxCount) #|>
#filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Qiime2', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1'))
maxTruth_scores <- add_scores(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth )
maxTruth_scores <- maxTruth_scores |> rowwise() |> mutate(recall = recall(TP, FN), precision = precision(TP, FP),
f1 = f1(precision, recall), f0.5 = f0.5(precision, recall), accuracy = accuracy(TP, FP, FN, TN))
maxTruth_scoreS_plot <- maxTruth_scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1)) + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + geom_point() + ggtitle('12S scores')
maxTruth_scoreS_plot
With the Rottnest data MaxCount performs slightly better! But still not
as good as MMSeqs2….
sixteenS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
tail(sixteenS_truth)
## # A tibble: 6 × 3
## True_OTU True_family True_species
## <chr> <chr> <chr>
## 1 ASV_106 Alopiidae Isurus oxyrinchus
## 2 ASV_107 Carangidae Seriola dumerili
## 3 ASV_108 Carcharhinidae Sphyrna lewini
## 4 ASV_109 Scombridae Auxis thazard
## 5 ASV_110 Syngnathidae Phycodurus eques
## 6 ASV_111 Syngnathidae Phyllopteryx taeniolatus
sixteenS_relative_plot <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type) |>
count(CorrectSpecies) |>
mutate(total = sum(n)) |>
mutate(missing = 112 - total) |>
group_modify(~ add_row(.x)) |>
group_modify(~ {
.x |> mutate(new_col= max(missing, na.rm=TRUE)) |>
mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
TRUE ~ n)) |>
select(-new_col)
} ) |>
mutate(total = 112) |>
mutate(perc = n / total * 100) |>
mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |>
mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |>
tidyr::complete(CorrectSpecies, fill = list(n=0, total = 112, missing = NA, perc = 0)) |>
ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+
geom_col() +
coord_flip() +
theme_minimal() +
ylab('Percentage') + xlab('Type') +
ggtitle('16S: Correct and incorrect species-level classifications (relative)') +
scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
sixteenS_relative_plot
scores <- add_scores(sixteenS_data_vs_16S_100, sixteenS_truth)
scores
## # A tibble: 11 × 5
## Type TP FP TN FN
## <chr> <int> <int> <int> <dbl>
## 1 BLAST100 53 3 0 46
## 2 BLAST97 62 6 2 32
## 3 Kraken_0.0 70 16 2 14
## 4 Kraken_0.05 69 11 2 20
## 5 Kraken_0.1 66 8 2 26
## 6 MMSeqs2 69 8 2 23
## 7 Metabuli 40 2 1 59
## 8 Mothur 61 27 2 12
## 9 NBC 56 20 2 24
## 10 Qiime2 56 20 1 25
## 11 VSEARCH 68 11 1 22
scores <- scores |> rowwise() |> mutate(recall = recall(TP, FN), precision = precision(TP, FP),
f1 = f1(precision, recall), f0.5 = f0.5(precision, recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 11 × 10
## # Rowwise:
## Type TP FP TN FN recall precision f1 f0.5 accuracy
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BLAST100 53 3 0 46 0.535 0.946 0.684 0.820 0.520
## 2 BLAST97 62 6 2 32 0.660 0.912 0.765 0.847 0.627
## 3 Kraken_0.0 70 16 2 14 0.833 0.814 0.824 0.818 0.706
## 4 Kraken_0.05 69 11 2 20 0.775 0.862 0.817 0.844 0.696
## 5 Kraken_0.1 66 8 2 26 0.717 0.892 0.795 0.851 0.667
## 6 MMSeqs2 69 8 2 23 0.75 0.896 0.817 0.862 0.696
## 7 Metabuli 40 2 1 59 0.404 0.952 0.567 0.749 0.402
## 8 Mothur 61 27 2 12 0.836 0.693 0.758 0.718 0.618
## 9 NBC 56 20 2 24 0.7 0.737 0.718 0.729 0.569
## 10 Qiime2 56 20 1 25 0.691 0.737 0.713 0.727 0.559
## 11 VSEARCH 68 11 1 22 0.756 0.861 0.805 0.837 0.676
sixteenS_score_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |> ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1)) + theme_minimal_hgrid() + theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('16S scores')
sixteenS_score_plot
b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, recall, precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
color = colorRampPalette(brewer.pal(n = 7, name =
"RdYlGn"))(100))
### Calculate Upset-based species sightings
type_list <- sixteenS_data_vs_16S_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |>
group_by(species) |>
summarize('Type' = list(Type))
a <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('16S: Shared species') +
ylab('Species')
a
type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species == True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
b <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('16S: Shared correct species') +
ylab('Species')
b
type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
filter(species != True_species) |>
filter(species != 'dropped' & !is.na(species)) |>
select(Type, species) |> unique() |>
group_by(species) |>
summarize('Type' = list(Type))
c <- type_list |>
ggplot(aes(x = Type)) +
geom_bar() +
scale_x_upset(n_intersections = 12) +
ggtitle('16S: Shared incorrect species') +
ylab('Species')
c
a + b + c & ylim(c(0, 20)) &
theme(
# Hide panel borders and remove grid lines
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
#panel.grid.major.y = element_line(),
# Change axis line
axis.line = element_line(colour = "black")
)
sixteenS_relative_plot / twelve_s_relative_plot
Let’s make without titles, but with a/b
(sixteenS_relative_plot + ggtitle('') + ylab(''))/ (twelve_s_relative_plot + ggtitle('')) +
plot_annotation(tag_levels = c('A','B')) +
plot_layout(guides = 'collect')
(sixteenS_score_plot +geom_point() + theme(axis.title.x = element_blank()))/ (twelveS_scoreS_plot + geom_point())
twelve_exclusions <- data |> filter(str_starts(Subject, '12s_v010_final.fasta_Taxonomies.CountedFams.txt_')) |>
filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa')
table(twelve_exclusions$Subject)
##
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 942
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 946
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 962
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 939
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 913
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 947
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 921
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 929
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 977
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 954
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 990
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 967
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 959
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 984
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 991
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 1021
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 953
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 1022
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 958
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 975
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 877
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 860
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 859
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 832
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 827
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 877
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 829
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 844
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 856
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 906
twelve_exclusions_split <- twelve_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |>
# get rid of leftover non-subsetted databases
filter(!is.na(hit)) |>
separate(hit, into=c('Database', 'after'), sep='_subset')
twelve_exclusions_split_averaged <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 102 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
group_by(Type, Database) |>
summarise(mean_TP = mean(TP),
mean_FP = mean(FP),
mean_TN = mean(TN),
mean_FN = mean(FN)) |>
rowwise() |>
mutate(recall = recall(mean_TP, mean_FN),
precision = precision(mean_TP, mean_FP),
f1 = f1(precision, recall),
f0.5 = f0.5(precision, recall),
accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
twelve_exclusions_split_averaged <- twelve_exclusions_split_averaged |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '70%',
Database == 'thirty' ~ '30%',
TRUE ~ Database))
f1_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
f0.5_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
precision_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = precision, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
recall_pl <- twelve_exclusions_split_averaged |>
ggplot(aes(x = Type, y = recall, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
(f1_pl / f0.5_pl / precision_pl / recall_pl) + plot_layout(guides = 'collect')
Lets zero in on the precision and make boxplots with jitter dots
un_summarised_twelve <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 102 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
rowwise() |>
mutate(recall = recall(TP, FN),
precision = precision(TP, FP),
f1 = f1(precision, recall),
f0.5 = f0.5(precision, recall),
accuracy = accuracy(TP, FP, FN, TN)) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '70%',
Database == 'thirty' ~ '30%',
TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(precision))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = precision)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('Precision') +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(f0.5))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(recall))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = recall)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_twelve |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Qiime2', 'TNT')) |>
ggplot(aes(x=Database, y = precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot() +
labs(fill='Type') +
ylab('Precision') +
theme_minimal()
false_positives <- un_summarised_twelve |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2', 'TNT')) |>
ggplot(aes(x=Database, y = FP/102*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('False positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
false_positives
true_positives <- un_summarised_twelve |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2', 'TNT')) |>
ggplot(aes(x=Database, y = TP/102*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('True positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
true_positives
false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()
### Phylogenetic diversity
We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.
spec_summarised <- twelve_exclusions_split |>
group_by(Type, Query, Database, after) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '70%',
Database == 'thirty' ~ '30%',
TRUE ~ Database)) |>
filter(!is.na(species)) |>
summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
Let’s also do not all of the classifiers
spec_summarised |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2', 'TNT')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
a <- spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0','Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2', 'TNT')) |>
group_by(Database) |>
arrange(Database) |>
group_map(~aov(`Alpha diversity index` ~ Type, data=.))
names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 4603.887 1552.100
## Deg. of Freedom 7 72
##
## Residual standard error: 4.642946
## Estimated effects may be unbalanced
##
## $`50%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 4080.95 970.60
## Deg. of Freedom 7 72
##
## Residual standard error: 3.671588
## Estimated effects may be unbalanced
##
## $`70%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 3226.35 1167.20
## Deg. of Freedom 7 72
##
## Residual standard error: 4.026302
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()
for(key in names(a)) {
print(key)
groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|>
as_tibble(rownames = 'Type') |>
select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2', 'TNT')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 4) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none")
sixteen_exclusions <- data |> filter(str_starts(Subject, '16S_v04_final.fasta_Taxonomies.')) |>
filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa')
table(sixteen_exclusions$Subject)
##
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta
## 985
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta
## 973
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta
## 1063
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta
## 982
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta
## 983
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta
## 992
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta
## 1016
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta
## 1045
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta
## 955
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta
## 979
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta
## 979
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta
## 992
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta
## 955
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta
## 963
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta
## 906
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta
## 1014
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta
## 936
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta
## 960
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta
## 991
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta
## 993
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta
## 908
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta
## 936
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta
## 926
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta
## 902
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta
## 981
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta
## 918
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta
## 857
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta
## 968
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta
## 897
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta
## 917
sixteen_exclusions_split <- sixteen_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |>
# get rid of leftover non-subsetted databases
filter(!is.na(hit)) |>
separate(hit, into=c('Database', 'after'), sep='_subset')
sixteen_exclusions_split_averaged <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 102 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
group_by(Type, Database) |>
summarise(mean_TP = mean(TP),
mean_FP = mean(FP),
mean_TN = mean(TN),
mean_FN = mean(FN)) |>
rowwise() |>
mutate(recall = recall(mean_TP, mean_FN),
precision = precision(mean_TP, mean_FP),
f1 = f1(precision, recall),
f0.5 = f0.5(precision, recall),
accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
sixteen_exclusions_split_averaged <- sixteen_exclusions_split_averaged |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '70%',
Database == 'thirty' ~ '30%',
TRUE ~ Database))
f1_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
f0.5_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
precision_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = precision, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
recall_pl <- sixteen_exclusions_split_averaged |>
ggplot(aes(x = Type, y = recall, group = Database, color = Database, fill = Database)) +
geom_point() +
geom_line() +
theme_minimal()
(f1_pl / f0.5_pl / precision_pl / recall_pl) + plot_layout(guides = 'collect')
Lets zero in on the precision and make boxplots with jitter dots
un_summarised_sixteen <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|>
mutate(species = na_if(species, 'dropped')) |>
mutate(genus = na_if(genus, 'dropped')) |>
mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
!is.na(species) & True_species != species ~ 'Incorrect species',
!is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
!is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
!is.na(family) & True_family == family ~ 'Correct family',
!is.na(family) & True_family != family ~ 'Incorrect family',
TRUE ~ NA)) |>
group_by(Type, Database, after) |>
summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
FN = sum(is.na(species) & !is.na(True_species))) |>
mutate(sums = TP + FP + TN + FN) |>
mutate(missing = 112 - sums) |>
mutate(FN = FN + missing) |>
mutate(sums = TP + FP + TN + FN) |>
select(-c(missing, sums)) |>
rowwise() |>
mutate(recall = recall(TP, FN),
precision = precision(TP, FP),
f1 = f1(precision, recall),
f0.5 = f0.5(precision, recall),
accuracy = accuracy(TP, FP, FN, TN)) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '70%',
Database == 'thirty' ~ '30%',
TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(precision))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = precision)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('Precision') +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(f0.5, na.rm=TRUE))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(recall))) |>
ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = recall)) +
geom_boxplot(outlier.shape = NA) +
coord_flip() +
theme_minimal() +
xlab('Type') +
ylab('f0.5') +
ylim(c(0, 1)) +
geom_point(position = position_jitterdodge(), alpha=0.5)
un_summarised_sixteen |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Qiime2', 'TNT')) |>
ggplot(aes(x=Database, y = precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot() +
labs(fill='Type') +
ylab('Precision') +
theme_minimal()
false_positives <- un_summarised_sixteen |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2', 'TNT')) |>
ggplot(aes(x=Database, y = FP/112*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('False positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
false_positives
true_positives <- un_summarised_sixteen |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2', 'TNT')) |>
ggplot(aes(x=Database, y = TP/112*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) +
geom_boxplot(outlier.shape=NA) +
labs(fill='Type') +
ylab('True positives (%)') +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE)+
theme_minimal()
true_positives
false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()
### Phylogenetic diversity
We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.
spec_summarised <- sixteen_exclusions_split |>
group_by(Type, Query, Database, after) |>
mutate(Database = case_when ( Database == 'fifty' ~ '50%',
Database == 'seventy' ~ '70%',
Database == 'thirty' ~ '30%',
TRUE ~ Database)) |>
filter(!is.na(species)) |>
summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
Let’s also do not all of the classifiers
spec_summarised |>
filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2', 'TNT')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot() +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) + coord_flip() + theme_minimal()
a <- spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2', 'TNT')) |>
group_by(Database) |>
arrange(Database) |>
group_map(~aov(`Alpha diversity index` ~ Type, data=.))
names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 10603.389 2043.131
## Deg. of Freedom 7 69
##
## Residual standard error: 5.441561
## Estimated effects may be unbalanced
##
## $`50%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 14582.09 3384.30
## Deg. of Freedom 7 72
##
## Residual standard error: 6.855958
## Estimated effects may be unbalanced
##
## $`70%`
## Call:
## aov(formula = `Alpha diversity index` ~ Type, data = .)
##
## Terms:
## Type Residuals
## Sum of Squares 4334.886 1085.700
## Deg. of Freedom 6 63
##
## Residual standard error: 4.151305
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()
for(key in names(a)) {
print(key)
groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|>
as_tibble(rownames = 'Type') |>
select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
spec_summarised |>
filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2', 'TNT')) |>
left_join(groups_df, by = c('Database', 'Type')) |>
ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) +
geom_boxplot(outlier.shape=NA) +
geom_point(aes(color=Type),
position = position_jitterdodge(jitter.width = 0.2),
alpha=0.5,
show.legend = FALSE) +
facet_wrap(~Database) +
geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
#col = 'black',
size = 4) +
#coord_flip() +
theme_minimal() +
theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
guides(fill="none")